#!/usr/bin/env python
# coding: utf-8
# download_rockland_raw_bids_ver2.py
#
# Modified by: Kefin Sajan 2023
# Based on Ver1 by: Daniel Clark, John Pellman 2015/2016,
# Carlos G. Candano 2017/2018, Yiwen Tian 2021
'''
This script downloads data from the NKI Rockland Public releases stored in the
cloud in BIDS format. You can specify sex, age range, handedness, session and
scan type (anatomical, functional, dwi) to limit your download to a subset of
the sample. If no options are specified, all available files are downloaded.
Use '-h' for more information about command line usage.
'''

# Import packages
import os
import warnings
import pandas
import boto3
warnings.simplefilter(action='ignore', category=FutureWarning)

# For anonymous access to the bucket.
from botocore import UNSIGNED
from botocore.client import Config
from botocore.handlers import disable_signing

SESSIONS = ['BAS1', 'BAS2', 'FLU1', 'TRT', 'FLU2']
SCANS = ['anat', 'func', 'dwi']
SERIES_MAP = {'CHECKERBOARD1400': 'task-CHECKERBOARD_acq-1400',
              'CHECKERBOARD645': 'task-CHECKERBOARD_acq-645',
              'RESTCAP': 'task-rest_acq-CAP',
              'REST1400': 'task-rest_acq-1400',
              'BREATHHOLD1400': 'task-BREATHHOLD_acq-1400',
              'REST645': 'task-rest_acq-645',
              'RESTPCASL': 'task-rest_pCASL'}


def files(client, bucket, prefix=''):
    """
    Yield the common prefixes in the bucket, splitting the listing on
    'participants.tsv' (i.e., the path up to each participants.tsv file).
    """
    paginator = client.get_paginator('list_objects')
    for result in paginator.paginate(Bucket=bucket, Prefix=prefix,
                                     Delimiter='participants.tsv'):
        for common_prefix in result.get('CommonPrefixes', []):
            yield common_prefix.get('Prefix')


def generate_Subfolders(s3_client):
    """
    Generate a list of the bucket contents under the Rockland BIDS prefix.
    """
    gen_subfolders = files(s3_client, 'fcp-indi',
                           prefix='data/Projects/RocklandSample/RawDataBIDSLatest/')
    genSubfoldersList = list(gen_subfolders)
    return genSubfoldersList


# Main collect and download function
def collect_and_download(out_dir, aws_links='', less_than=0, greater_than=0,
                         sex='', handedness='', sessions=SESSIONS, scans=SCANS,
                         series=SERIES_MAP.keys(), derivatives=False,
                         dryrun=False):
    '''
    Function to collect and download images from the Rockland sample
    directory on FCP-INDI's S3 bucket

    Parameters
    ----------
    out_dir : string
        filepath to a local directory to save files to
    aws_links : string
        filepath of aws_links.csv from
        http://fcon_1000.projects.nitrc.org/indi/enhanced/aws_links.csv
    less_than : float
        upper age (years) threshold for participants of interest
    greater_than : float
        lower age (years) threshold for participants of interest
    sex : string
        'M' or 'F' to indicate whether to download male or female data
    handedness : string
        'R' or 'L' to indicate whether to download right-handed or
        left-handed participants
    sessions : list
        the session names (e.g., 'BAS1', 'FLU1')
    scans : list
        the scan types to download; can be 'anat', 'func' or 'dwi'
    series : list
        the series to download (for functional scans)
    derivatives : boolean
        whether or not to download data derivatives for functional scans
    dryrun : boolean
        whether or not to perform a dry run (i.e., no actual downloads,
        just listing the files that would be downloaded)

    Returns
    -------
    boolean
        True if the download was successful, False otherwise.
    '''
    # Init variables
    s3_bucket_name = 'fcp-indi'
    s3_prefix = 'data/Projects/RocklandSample/RawDataBIDSLatest'

    # Fetch bucket anonymously
    s3 = boto3.resource('s3')
    s3.meta.client.meta.events.register('choose-signer.s3.*', disable_signing)

    # Remove series that aren't in the series map keys.
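    # For example (using the defaults above), series=['REST645', 'NOTREAL']
    # would be reduced to ['REST645'] here and then mapped below to the BIDS
    # substring 'task-rest_acq-645', which is what actually appears in the
    # S3 filepaths. ('NOTREAL' is a made-up name for illustration.)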
    series = [s for s in series if s in SERIES_MAP]
    # Generate a list of series substrings to filter on
    # (e.g. task-CHECKERBOARD_acq-1400)
    series_filt = [SERIES_MAP[s] for s in series]

    # If output path doesn't exist, create it
    if not os.path.exists(out_dir) and not dryrun:
        print('Could not find %s, creating now...' % out_dir)
        os.makedirs(out_dir)

    if aws_links and os.path.exists(aws_links):
        participants_df = pandas.read_csv(aws_links, na_values=['n/a'])
    else:
        print('Downloading aws_links.csv from '
              'http://fcon_1000.projects.nitrc.org/indi/enhanced...')
        os.system('wget http://fcon_1000.projects.nitrc.org/indi/enhanced/aws_links.csv')
        print('Saved to current folder')
        participants_df = pandas.read_csv('aws_links.csv', na_values=['n/a'])

    s3_client = boto3.client('s3', config=Config(signature_version=UNSIGNED))
    print('Collecting images of interest...')

    # Remove the participant rows whose age range, handedness and sex do not
    # conform to the criteria.
    if less_than:
        participants_df = participants_df[participants_df['age'] < less_than]
    if greater_than:
        participants_df = participants_df[participants_df['age'] > greater_than]
    if sex == 'M':
        participants_df = participants_df[participants_df['gender'] == 'M']
    elif sex == 'F':
        participants_df = participants_df[participants_df['gender'] == 'F']
    if handedness == 'R':
        participants_df = participants_df[participants_df['handedness'] == 'RIGHT']
    elif handedness == 'L':
        participants_df = participants_df[participants_df['handedness'] == 'LEFT']
    participants_df = participants_df[participants_df['session'].isin(sessions)]
    participants_df = participants_df[
        participants_df['filepath'].str.contains('|'.join(scans), na=False)]
    # Keep non-functional scans, and functional scans only if they match one
    # of the requested series.
    contains_series = participants_df['filepath'].str.contains('|'.join(series_filt), na=False)
    is_func = participants_df['filepath'].str.contains('func', na=False)
    participants_df = participants_df[contains_series | ~is_func]

    if len(participants_df) == 0:
        print('No participants meet the criteria given. No download will be initiated.')
        return False

    # Generate a single-column list of participants to filter on.
    participants_filt = ['sub-' + label + '/' for label in participants_df['subject'].tolist()]
    participants_filt = list(set(participants_filt))
    s3_keylist = list(set(participants_df['filepath']))

    # Append the dataset_description.json path to the end of s3_keylist
    s3_keylist.append('/'.join(['s3://fcp-indi', s3_prefix, 'dataset_description.json']))

    # Verify that the participants DataFrame has only the subjects that appear
    # in s3_keylist; NOT all participants have all the scan types.
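    # (Illustrative key, not a real participant: an entry such as
    #  's3://fcp-indi/data/Projects/RocklandSample/RawDataBIDSLatest/sub-A00012345/ses-BAS1/anat/sub-A00012345_ses-BAS1_T1w.nii.gz'
    #  contains the 13-character component 'sub-A00012345'; the loop below
    #  strips the 'sub-' prefix and matches the remaining ID against the
    #  first column of participants_df.)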
    newParticipantsDf = pandas.DataFrame(columns=participants_df.columns)
    for row in s3_keylist:
        rowDivided = row.split('/')
        # Get only the path components of the form sub-A000...
        justTheSub = [x for x in rowDivided if x.startswith('sub-A') and len(x) == 13]
        if len(justTheSub) > 0:
            # Remove the sub- part
            ursi = justTheSub[0][4:]
            particRow = participants_df[participants_df.iloc[:, 0].str.contains(ursi)]
            # Build the DataFrame; repeated rows are removed afterwards
            newParticipantsDf = pandas.concat([newParticipantsDf, particRow])
    newParticipantsDf = newParticipantsDf.drop_duplicates()
    participants_df = newParticipantsDf

    # Re-create the participants list after verifying which participants have
    # the correct scan types (cross-checking with s3_keylist).
    participants_filt = ['sub-' + label + '/' for label in participants_df['subject'].tolist()]

    # And download the items; the total is the number of rows in s3_keylist.
    total_num_files = len(s3_keylist)
    files_downloaded = len(s3_keylist)
    # For each path and each index in s3_keylist
    for path_idx, s3_path in enumerate(s3_keylist):
        s3_path = s3_path.replace('s3://fcp-indi/', '')
        print(s3_path)
        # Strip the data/Projects/RocklandSample/RawDataBIDSLatest prefix and
        # the leading slash from each path in the list
        rel_path = s3_path.replace(s3_prefix, '').lstrip('/')
        # Create local paths for the folder and the file
        download_file = os.path.join(out_dir, rel_path)
        download_dir = os.path.dirname(download_file)
        # Create the folder in the path specified
        if not os.path.exists(download_dir) and not dryrun:
            os.makedirs(download_dir)
        try:
            if not os.path.exists(download_file):
                # A dry run will not download the files
                if dryrun:
                    print('Would download to: %s' % download_file)
                else:
                    print('Downloading to: %s' % download_file)
                    # Download the file into the folder just created
                    with open(download_file, 'wb') as f:
                        s3_client.download_fileobj(s3_bucket_name, s3_path, f)
                    print('%.3f%% complete' % (100 * float(path_idx + 1) / total_num_files))
            else:
                print('File {} already exists, skipping...'.format(download_file))
                files_downloaded -= 1
        except Exception as exc:
            print('There was a problem downloading %s.\n'
                  'Check input arguments and try again.' % s3_path)
            print(exc)

    if dryrun:
        print('%d files would be downloaded for %d participant(s).'
              % (files_downloaded, len(participants_df)))
    else:
        print('%d files downloaded for %d participant(s).'
              % (files_downloaded, len(participants_df)))

    if not dryrun:
        print('Saving out revised participants.tsv and session tsv files.')
        # Save a revised participants.tsv to the output directory; if a
        # participants.tsv already exists there, open it and merge it with
        # the new one.
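        # (e.g. a first run that fetched only 'anat' scans and a second run
        #  that fetched 'func' scans both write here; concatenating and
        #  dropping duplicates keeps one row per participant entry across
        #  runs instead of overwriting the earlier metadata.)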
        if os.path.isfile(os.path.join(out_dir, 'participants.tsv')):
            old_participants_df = pandas.read_csv(
                os.path.join(out_dir, 'participants.tsv'),
                delimiter='\t', na_values=['n/a', 'N/A'])
            participants_df = pandas.concat([participants_df, old_participants_df],
                                            ignore_index=True)
            participants_df.drop_duplicates(inplace=True)
            os.remove(os.path.join(out_dir, 'participants.tsv'))
        participants_df.to_csv(os.path.join(out_dir, 'participants.tsv'),
                               sep='\t', na_rep='n/a', index=False)
    print('Done!')
    return True


# Make module executable
if __name__ == '__main__':
    # Import packages
    import argparse
    import sys

    # Init argument parser
    parser = argparse.ArgumentParser(description=__doc__)

    # Required arguments
    parser.add_argument('-o', '--out_dir', required=True, type=str,
                        help='Path to local folder to download files to')

    # Optional arguments
    parser.add_argument('-al', '--aws_links', required=False, type=str,
                        help='Path to aws_links.csv. Leave it empty to let the '
                             'script search for it and download it to the '
                             'current directory')
    parser.add_argument('-lt', '--less_than', required=False, type=float,
                        help='Upper age threshold (in years) of participants '
                             'to download (e.g. for subjects 30 or younger, '
                             '\'-lt 31\')')
    parser.add_argument('-gt', '--greater_than', required=False, type=float,
                        help='Lower age threshold (in years) of participants '
                             'to download (e.g. for subjects 31 or older, '
                             '\'-gt 30\')')
    parser.add_argument('-x', '--sex', required=False, type=str,
                        help='Participant sex of interest to download only '
                             '(e.g. \'M\' or \'F\')')
    parser.add_argument('-m', '--handedness', required=False, type=str,
                        help='Participant handedness to download only '
                             '(e.g. \'R\' or \'L\')')
    parser.add_argument('-v', '--sessions', required=False, nargs='*', type=str,
                        help='A space-separated list of session (visit) codes '
                             'to download (e.g. \'BAS1\' \'FLU1\')')
    parser.add_argument('-t', '--scans', required=False, nargs='*', type=str,
                        help='A space-separated list of scan types '
                             'to download (e.g. \'anat\' \'dwi\')')
    parser.add_argument('-e', '--series', required=False, nargs='*', type=str,
                        help='A space-separated list of series codes '
                             'to download (e.g. \'REST645\' \'CHECKERBOARD1400\')')
    parser.add_argument('-d', '--derivatives', required=False, action='store_true',
                        help='Download derivatives (despiked physio, masks) '
                             'in addition to raw data?')
    parser.add_argument('-n', '--dryrun', required=False, action='store_true',
                        help='Perform a dry run to see how many files would '
                             'be downloaded.')

    # Parse and gather arguments
    args = parser.parse_args()

    # Init variables
    out_dir = os.path.abspath(args.out_dir)
    kwargs = {}
    if args.aws_links:
        kwargs['aws_links'] = args.aws_links
    elif os.path.exists('aws_links.csv'):
        print('Found aws_links.csv in current working directory.')
        kwargs['aws_links'] = 'aws_links.csv'
    if args.less_than:
        kwargs['less_than'] = args.less_than
        print('Using upper age threshold of %d...' % kwargs['less_than'])
    else:
        print('No upper age threshold specified')
    if args.greater_than:
        kwargs['greater_than'] = args.greater_than
        print('Using lower age threshold of %d...' % kwargs['greater_than'])
    else:
        print('No lower age threshold specified')
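    # (Note: both thresholds are exclusive, since collect_and_download uses
    #  strict comparisons; e.g. '-gt 17 -lt 31' keeps participants with
    #  17 < age < 31.)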
    if args.sex:
        kwargs['sex'] = args.sex.upper()
        if kwargs['sex'] == 'M':
            print('Downloading only male participants...')
        elif kwargs['sex'] == 'F':
            print('Downloading only female participants...')
        else:
            print('Input for sex \'%s\' was not \'M\' or \'F\'.' % kwargs['sex'])
            print('Please check script syntax and try again.')
            sys.exit(1)
    else:
        print('No sex specified, using all sexes...')
    if args.handedness:
        kwargs['handedness'] = args.handedness.upper()
        if kwargs['handedness'] == 'R':
            print('Downloading only right-handed participants...')
        elif kwargs['handedness'] == 'L':
            print('Downloading only left-handed participants...')
        else:
            print('Input for handedness \'%s\' was not \'L\' or \'R\'.'
                  % kwargs['handedness'])
            print('Please check script syntax and try again.')
            sys.exit(1)
    if args.sessions:
        kwargs['sessions'] = args.sessions
        for session in kwargs['sessions']:
            if session not in SESSIONS:
                print('Session \'%s\' is not a valid session name.' % session)
                print('Please check script syntax and try again.')
                sys.exit(1)
        print('Sessions to download: ' + ' '.join(kwargs['sessions']))
    if args.scans:
        kwargs['scans'] = args.scans
        for scan in kwargs['scans']:
            if scan not in SCANS:
                print('Scan \'%s\' is not a valid scan name.' % scan)
                print('Please check script syntax and try again.')
                sys.exit(1)
        print('Scans to download: ' + ' '.join(kwargs['scans']))
    if args.series:
        kwargs['series'] = args.series
        for series in kwargs['series']:
            if series not in SERIES_MAP:
                print('Series \'%s\' is not a valid series name.' % series)
                print('Please check script syntax and try again.')
                sys.exit(1)
        print('Series to download: ' + ' '.join(kwargs['series']))
    if args.derivatives:
        kwargs['derivatives'] = args.derivatives
        print('Data derivatives will be downloaded.')
    if args.dryrun:
        kwargs['dryrun'] = args.dryrun
        print('Running download as a dry run.')

    # Call the collect and download routine
    collect_and_download(out_dir, **kwargs)
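# Example invocation (illustrative output path and filters; '-n' makes this a
# dry run, so nothing is actually downloaded):
#   python download_rockland_raw_bids_ver2.py -o /tmp/rockland \
#       -v BAS1 -t func -e REST645 -gt 17 -lt 31 -x F -n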